# change this cell to 'Code' if you want run this cell
# the code below is copied from the workshop02
# moving files to corresponding folder after the download, like 2016 files to 'yellow_2016' etc.
# BUG FIX: `import urllib` alone does not import the `urllib.request` submodule
# in a fresh interpreter, so `urllib.request.urlretrieve` would fail with an
# AttributeError; import the submodule explicitly.
import urllib.request

# Download the 12 monthly yellow-taxi CSVs for 2016 and 2019 from the TLC S3 bucket.
for month in range(1, 13):
    fixed_month = f'{month:02}'
    fname01 = f'yellow_tripdata_2016-{fixed_month}.csv'
    fname02 = f'yellow_tripdata_2019-{fixed_month}.csv'
    url01 = f'https://s3.amazonaws.com/nyc-tlc/trip+data/{fname01}'
    url02 = f'https://s3.amazonaws.com/nyc-tlc/trip+data/{fname02}'
    urllib.request.urlretrieve(url01, fname01)
    urllib.request.urlretrieve(url02, fname02)
    print(f'Finish Downloading {fname01} & {fname02}')
# import all libraries used by the analysis
from datetime import datetime
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# keep the notebook output clean by silencing library warnings
# NOTE(review): this also hides real issues such as SettingWithCopyWarning
warnings.filterwarnings("ignore")
# 'months' holds the zero-padded month strings '01' .. '12'
months = ['{:02d}'.format(m) for m in range(1, 13)]
# the only columns of the 2019 trip files the analysis needs
features_19 = ['tpep_pickup_datetime', 'trip_distance', 'DOLocationID', 'tip_amount']
# convert each monthly 2019 csv to feather format
for mm in months:
    csv_path = f'yellow_2019/yellow_tripdata_2019-{mm}.csv'
    # read, keep only the needed columns, and drop rows containing NaN
    monthly = pd.read_csv(csv_path)[features_19].dropna()
    # keep only trips whose pickup year is exactly 2019 (see the outlier demo below)
    monthly = monthly[monthly['tpep_pickup_datetime'].str[:4] == '2019']
    # truncate the pickup time to 'YYYY-MM-DD HH' so it can later be joined
    # with the hourly weather records without producing many NaNs
    monthly['tpep_pickup_datetime'] = monthly['tpep_pickup_datetime'].str[:13]
    monthly.reset_index().to_feather(f'yellow_2019/yellow_tripdata_2019-{mm}.feather')
    print(f'Finish converting {csv_path}')
# why the exact-year filter is needed: the raw data contains impossible
# timestamps, e.g. trips "starting" in 2088
incorrect_csv01 = pd.read_csv('yellow_2019/yellow_tripdata_2019-01.csv')[features_19].dropna()
incorrect_csv01[incorrect_csv01['tpep_pickup_datetime'].str[:4] > '2019'].head(2)
# convert the first six 2016 monthly files to feather (same steps as for 2019;
# the 2016 files have no DOLocationID column)
features_16 = ['tpep_pickup_datetime', 'trip_distance', 'tip_amount']
for mm in months[:6]:
    csv_path = f'yellow_2016/yellow_tripdata_2016-{mm}.csv'
    monthly = pd.read_csv(csv_path)[features_16].dropna()
    monthly = monthly[monthly['tpep_pickup_datetime'].str[:4] == '2016']
    monthly['tpep_pickup_datetime'] = monthly['tpep_pickup_datetime'].str[:13]
    monthly.reset_index().to_feather(f'yellow_2016/yellow_tripdata_2016-{mm}.feather')
    print(f'Finish converting {csv_path}')
# from July 2016 on the csv layout is broken: the first two columns are parsed
# as the index and the last two columns come out missing
incorrect_csv02 = pd.read_csv('yellow_2016/yellow_tripdata_2016-07.csv')
print(f'incorrect index for the csv: {incorrect_csv02.index[0]}')
incorrect_csv02.head(2)
# repair the malformed 2016 files (July-December) and convert them to feather
for mm in months[6:]:
    csv_path = f'yellow_2016/yellow_tripdata_2016-{mm}.csv'
    broken = pd.read_csv(csv_path)
    # the header row is intact, but the first two data columns were read as the
    # index; shift them back with reset_index, drop the two trailing junk
    # columns, then restore the original column names
    header = broken.columns
    repaired = broken.reset_index()
    repaired = repaired.drop(repaired.columns[-2:], axis=1)
    repaired.columns = header
    # same cleaning as for the well-formed files: select features, drop NaNs,
    # keep 2016 pickups only, truncate the timestamp to the hour
    repaired = repaired[features_16].dropna()
    repaired = repaired[repaired['tpep_pickup_datetime'].str[:4] == '2016']
    repaired['tpep_pickup_datetime'] = repaired['tpep_pickup_datetime'].str[:13]
    repaired.reset_index().to_feather(f'yellow_2016/yellow_tripdata_2016-{mm}.feather')
    print(f'Finish converting {csv_path}')
# read the weather dataset (small enough that a feather conversion is unnecessary)
weather_csv = pd.read_csv('NYC_weather_dataset_2016_2020.csv')
weather_csv.head(1)
# neither 'rain_1h' nor 'snow_1h' ever records a literal 0 (the display below
# comes back empty), so treating every NaN as 0 is a reasonable assumption
weather_csv.loc[(weather_csv['rain_1h'] == 0) | (weather_csv['snow_1h'] == 0)]
weather_csv = weather_csv.fillna(0)
# keep only 'YYYY-MM-DD HH' so records line up with the truncated taxi timestamps
weather_csv['dt_iso'] = weather_csv['dt_iso'].str[:13]
# hourly average temperature derived from the recorded max/min pair
weather_csv['avg_temp'] = (weather_csv['temp_max'] + weather_csv['temp_min']) / 2
# we care about all precipitation, not just rain, so combine rain and snow
weather_csv['tot_prec'] = weather_csv['rain_1h'] + weather_csv['snow_1h']
# keep only the columns the analysis uses
weather_features = ['dt_iso', 'avg_temp', 'tot_prec']
weather_csv = weather_csv[weather_features]
# weather_19 / weather_16: per-year slices of the weather table.
# FIX: take explicit copies — later cells add columns to these frames (e.g.
# weather_19['temp_int'] = ...), which on a bare .loc slice is chained
# assignment; the resulting SettingWithCopyWarning is invisible here because
# warnings are filtered, so the hazard would go unnoticed.
weather_19 = weather_csv.loc[weather_csv['dt_iso'].str[:4] == '2019'].copy()
weather_16 = weather_csv.loc[weather_csv['dt_iso'].str[:4] == '2016'].copy()
# annual_prec_16 & annual_prec_19: total precipitation per month
# annual_temp_16 & annual_temp_19: average temperature per month
annual_prec_16 = [weather_16.loc[weather_16['dt_iso'].str[5:7] == month]['tot_prec'].sum() for month in months]
annual_prec_19 = [weather_19.loc[weather_19['dt_iso'].str[5:7] == month]['tot_prec'].sum() for month in months]
annual_temp_16 = [weather_16.loc[weather_16['dt_iso'].str[5:7] == month]['avg_temp'].mean() for month in months]
annual_temp_19 = [weather_19.loc[weather_19['dt_iso'].str[5:7] == month]['avg_temp'].mean() for month in months]
%%time
# taxi_demand_16 is a list of total number of rows (taxi demand) for each month in 2016
# taxi_demand_19 is a list of total number of rows (taxi demand) for each month in 2019
taxi_demand_16 = []
taxi_demand_19 = []
for month in months:
tot_taxi_16 = pd.read_feather(f'yellow_2016/yellow_tripdata_2016-{month}.feather').drop('index', axis=1).shape[0]
tot_taxi_19 = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{month}.feather').drop('index', axis=1).shape[0]
taxi_demand_16.append(tot_taxi_16)
taxi_demand_19.append(tot_taxi_19)
# twin-axis chart: monthly taxi demand (left) vs average temperature (right), 2016
fig, demand_ax = plt.subplots()
temp_ax = demand_ax.twinx()
demand_ax.plot(months, taxi_demand_16, 'k-')
temp_ax.plot(months, annual_temp_16, 'r--')
demand_ax.set_xlabel('Months')
demand_ax.set_ylabel('Total taxi demand 2016', color='k')
temp_ax.set_ylabel('Annual temperature 2016', color='r')
plt.title('Annual temperature vs Total taxi demand in 2016')
plt.savefig('plots/Annual_temperature_vs_Total_taxi_demand_in_2016.png')
plt.show()
# the same chart for 2019
fig, demand_ax = plt.subplots()
temp_ax = demand_ax.twinx()
demand_ax.plot(months, taxi_demand_19, 'k-')
temp_ax.plot(months, annual_temp_19, 'r--')
demand_ax.set_xlabel('Months')
demand_ax.set_ylabel('Total taxi demand 2019', color='k')
temp_ax.set_ylabel('Annual temperature 2019', color='r')
plt.title('Annual temperature vs Total taxi demand in 2019')
plt.savefig('plots/Annual_temperature_vs_Total_taxi_demand_in_2019.png')
plt.show()
%%time
# In order to check whether precipitation is a factor for total taxi demand, we use the ratio between taxi demand
# under precipitation and total demand for each month instead of total taxi demand
# The reason is each month has different taxi demand, we want to reduce the influence caused by other factor
# like temperature.
# prec_demand_ratio_16 is the list of ratio for each month
# prec_datetime_16 is a dataframe comes from weather_16 with only datetime and amount of total precipitation columns
# and amount of precipitation is greater than 0
prec_demand_ratio_16 = []
prec_datetime_16 = weather_16.loc[weather_16['tot_prec'] > 0][['dt_iso','tot_prec']]
for month in months:
# read the data and total demand for each month is the first element of shape of dataframe
df = pd.read_feather(f'yellow_2016/yellow_tripdata_2016-{month}.feather').drop('index', axis=1)['tpep_pickup_datetime']
tot_demand = df.shape[0]
# joining the dataframe with weather data by the datetime column
meta_df = pd.merge(df, prec_datetime_16, left_on='tpep_pickup_datetime', right_on='dt_iso').drop('dt_iso',axis=1)
prec_demand = meta_df.shape[0]
# ratio is taxi demand under precipitation divided by total demand for each month, add ratio to list
ratio = prec_demand/tot_demand
prec_demand_ratio_16.append(ratio)
%%time
# the procedure below is same with above except the time is in 2019
prec_demand_ratio_19 = []
prec_datetime_19 = weather_19.loc[weather_19['tot_prec'] > 0][['dt_iso','tot_prec']]
for month in months:
df = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{month}.feather').drop('index', axis=1)['tpep_pickup_datetime']
tot_demand = df.shape[0]
meta_df = pd.merge(df, prec_datetime_19, left_on='tpep_pickup_datetime', right_on='dt_iso').drop('dt_iso',axis=1)
prec_demand = meta_df.shape[0]
ratio = prec_demand/tot_demand
prec_demand_ratio_19.append(ratio)
# precipitation vs the monthly precipitation-demand ratio for 2016
# BUG FIX: the left axis plots prec_demand_ratio_16 (a share of trips), not the
# raw demand, so its label must say so.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(months, prec_demand_ratio_16, 'k-')
ax2.plot(months, annual_prec_16, 'b--')
ax1.set_xlabel('Months')
ax1.set_ylabel('Prec demand ratio 2016', color='k')
ax2.set_ylabel('Annual precipitation 2016', color='b')
plt.title('Annual precipitation vs Total taxi demand in 2016')
plt.savefig('plots/Annual_precipitation_vs_Total_taxi_demand_in_2016.png')
plt.show()
# precipitation vs the monthly precipitation-demand ratio for 2019
# BUG FIX: the original plotted taxi_demand_19 on the left axis and the ratio
# on the right, which contradicts both the comment above and the matching 2016
# chart; plot the ratio against the annual precipitation instead, and label
# the left axis accordingly.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()
ax1.plot(months, prec_demand_ratio_19, 'k-')
ax2.plot(months, annual_prec_19, 'b--')
ax1.set_xlabel('Months')
ax1.set_ylabel('Prec demand ratio 2019', color='k')
ax2.set_ylabel('Annual precipitation 2019', color='b')
plt.title('Annual precipitation vs Total taxi demand in 2019')
plt.savefig('plots/Annual_precipitation_vs_Total_taxi_demand_in_2019.png')
plt.show()
%%time
# the distance_series is a series contains all distance for a trip without precipitation
# the prec_distance_series is a series contains all distance for a trip with precipitation
distance_series = pd.Series()
prec_distance_series = pd.Series()
for month in months:
# open the file and select rows with trip_distance between 0 and 10
df = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{month}.feather').drop('index', axis=1)[['tpep_pickup_datetime','trip_distance']]
df = df.loc[(df['trip_distance']>0) & (df['trip_distance']<10)]
# merge the df with whole weather dataset for 2019
meta_df = pd.merge(df, weather_19, left_on='tpep_pickup_datetime', right_on='dt_iso').drop('dt_iso',axis=1)
# assign the trip_distance in each case to variable
distance_with_prec = meta_df.loc[meta_df['tot_prec']>0]['trip_distance']
distance_without_prec = meta_df.loc[meta_df['tot_prec']==0]['trip_distance']
# append the variable to the distance series
distance_series = distance_series.append(distance_without_prec)
prec_distance_series = prec_distance_series.append(distance_with_prec)
# distribution of trip distance for trips without precipitation in 2019
# FIX: sns.distplot is deprecated and removed in recent seaborn releases;
# histplot with stat='density' and kde=True produces the equivalent figure.
sns.histplot(distance_series, bins=300, stat='density', kde=True, label='Without PREC', color='red')
plt.title('Distribution of trip distance for trips without precipitation in 2019')
plt.legend()
plt.savefig('plots/Distribution_of_trip_distance_for_trips_without_precipitation_in_2019.png')
plt.show()
# summary statistics used in the analysis
print("mean of trip distance without PREC: {:.2f}".format(distance_series.mean()))
print("variance of trip distance without PREC: {:.2f}".format(distance_series.var()))
# distribution of trip distance for trips with precipitation in 2019
sns.histplot(prec_distance_series, bins=300, stat='density', kde=True, label='With PREC', color='blue')
plt.title('Distribution of trip distance for trips with precipitation in 2019')
plt.legend()
plt.savefig('plots/Distribution_of_trip_distance_for_trips_with_precipitation_in_2019.png')
plt.show()
print("mean of trip distance with PREC: {:.2f}".format(prec_distance_series.mean()))
print("variance of trip distance with PREC: {:.2f}".format(prec_distance_series.var()))
%%time
# df_lst is a list of dataframe for each month with only temperature and trip_distance
# In order to save time and gives enough sample between different temperature to make the statistic more confident,
# we need to convert temperature to int
df_lst = []
weather_19['temp_int'] = weather_19['avg_temp'].astype(int)
for month in months:
# read the file and select needed features
df = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{month}.feather').drop('index', axis=1)[['tpep_pickup_datetime', 'trip_distance']]
# merge df with weather_19 using the datetime, drop these two columns after merging
meta_df = pd.merge(df, weather_19[['dt_iso','temp_int']], left_on='tpep_pickup_datetime', right_on='dt_iso').drop(['dt_iso','tpep_pickup_datetime'],axis=1)
# append this dataframe to the df_lst
df_lst.append(meta_df)
# concatenate all df in the df_lst to form a giant df with trip distance and temperature for all months
all_month_trip_dist = pd.concat(df_lst)
# group the giant dataframe by temperature using the mean distance
# reset the index to make 'temp_int' into the column
distance_temp_df =all_month_trip_dist.groupby('temp_int').mean().reset_index()
# here is what the df after groupby looks like
distance_temp_df.head(2)
# plot the scatter plot for the average trip distance for each degree Celsius
distance_temp_df.plot.scatter(x='temp_int', y='trip_distance')
plt.title('Temperature vs average trip distance for all months in 2019')
plt.savefig('plots/Temperature_vs_average_trip_distance_for_all_months_in_2019.png')
plt.show()
%%time
tip_series = pd.Series()
prec_tip_series = pd.Series()
for month in months:
df = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{month}.feather').drop('index', axis=1)[['tpep_pickup_datetime','tip_amount']]
df = df.loc[(df['tip_amount']>0) & (df['tip_amount']<10)]
meta_df = pd.merge(df, weather_19, left_on='tpep_pickup_datetime', right_on='dt_iso').drop('dt_iso',axis=1)
tip_without_prec = meta_df.loc[meta_df['tot_prec']==0]['tip_amount']
tip_with_prec = meta_df.loc[meta_df['tot_prec']>0]['tip_amount']
tip_series =tip_series.append(tip_without_prec)
prec_tip_series = prec_tip_series.append(tip_with_prec)
# distribution of tip amount for trips without precipitation in 2019
# FIX: sns.distplot is deprecated/removed; use histplot with density + kde.
sns.histplot(tip_series, bins=60, stat='density', kde=True, label='Without PREC', color='red')
plt.title('Distribution of tip amount for trips without precipitation in 2019')
plt.legend()
plt.show()
print("mean of tip amount without PREC: {:.2f}".format(tip_series.mean()))
print("variance of tip amount without PREC: {:.2f}".format(tip_series.var()))
# distribution of tip amount for trips with precipitation in 2019
# BUG FIX: this plot was labelled 'Without PREC' although it shows the
# with-precipitation series (copy-paste error).
sns.histplot(prec_tip_series, bins=60, stat='density', kde=True, label='With PREC', color='blue')
plt.title('Distribution of tip amount for trips with precipitation in 2019')
plt.legend()
plt.show()
print("mean of tip amount with PREC: {:.2f}".format(prec_tip_series.mean()))
print("variance of tip amount with PREC: {:.2f}".format(prec_tip_series.var()))
%%time
# this procedure is almost same as temperature vs average distance above, except the column used is 'tip_amount'
df_lst = []
weather_19['temp_int'] = weather_19['avg_temp'].astype(int)
for month in months:
# read the file and select needed features
df = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{month}.feather').drop('index', axis=1)[['tpep_pickup_datetime', 'tip_amount']]
# merge df with weather_19 using the datetime, drop these two columns after merging
meta_df = pd.merge(df, weather_19[['dt_iso','temp_int']], left_on='tpep_pickup_datetime', right_on='dt_iso').drop(['dt_iso','tpep_pickup_datetime'],axis=1)
# append this dataframe to the df_lst
df_lst.append(meta_df)
# concatenate all df in the df_lst to form a giant df with tip_amount and temperature for all months
all_month_tip = pd.concat(df_lst)
# group the giant dataframe by temperature using the mean tip amount
tip_temp_df =all_month_tip.groupby('temp_int').mean().reset_index()
tip_temp_df.head(2)
# plot the scatter plot for the average tip amount for each degree Celsius
tip_temp_df.plot.scatter(x='temp_int', y='tip_amount')
plt.title('Temperature vs average tip amount for all months in 2019')
plt.savefig('plots/Temperature_vs_average_tip_amount_for_all_months_in_2019.png')
plt.show()
# geospatial setup (the code below is adapted from 'lab02')
# FIX: geopandas was imported twice in the original; one import is enough.
import folium
import geopandas as gpd

# sf: the NYC taxi-zone shapefile; zone: the LocationID -> zone-name lookup
sf = gpd.read_file("taxi_zones/taxi_zones.shp")
zone = pd.read_csv("taxi_zones/taxi+_zone_lookup.csv")
# joining the two tables is done via the numeric LocationID, not strings
# reproject the zone geometry to longitude/latitude (WGS84) so folium can draw it
sf['geometry'] = sf['geometry'].to_crs("+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs")
# build one frame of (DOLocationID, avg_temp) covering every 2019 month
monthly_frames = []
for mm in months:
    # read the month and keep only timestamp + drop-off location
    trips = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{mm}.feather').drop('index', axis=1)[['tpep_pickup_datetime', 'DOLocationID']]
    # attach the hourly average temperature, then drop the join keys
    merged = pd.merge(trips, weather_19[['dt_iso', 'avg_temp']], left_on='tpep_pickup_datetime', right_on='dt_iso').drop(['dt_iso', 'tpep_pickup_datetime'], axis=1)
    monthly_frames.append(merged)
all_month_loc_temp = pd.concat(monthly_frames)
all_month_loc_temp.tail(2)
# slice the trips by temperature quartile: keep those above the 75th and
# below the 25th percentile of the 2019 hourly average temperature
fst_quantile = weather_19['avg_temp'].describe()['25%']
sec_quantile = weather_19['avg_temp'].describe()['50%']
trd_quantile = weather_19['avg_temp'].describe()['75%']
all_month_loc_high_temp = all_month_loc_temp[all_month_loc_temp['avg_temp'] > trd_quantile]
all_month_loc_low_temp = all_month_loc_temp[all_month_loc_temp['avg_temp'] < fst_quantile]
# GeoDataFrame joining the high-temperature trips with the zone polygons
gdf_high = gpd.GeoDataFrame(pd.merge(all_month_loc_high_temp, sf, left_on='DOLocationID', right_on='LocationID')).drop('DOLocationID', axis=1)
# one geometry per zone, serialised to geoJSON for folium
geoJSON_high = gdf_high[['LocationID', 'geometry']].drop_duplicates('LocationID').to_json()
# trips per drop-off zone; the count is log-scaled so the colour range stays readable
location_dict_high = all_month_loc_high_temp.groupby(['DOLocationID']).count().reset_index()[['DOLocationID','avg_temp' ]]
location_dict_high['avg_temp'] = np.log(location_dict_high['avg_temp'])
# places.csv is a hand-written file of landmark names with their coordinates
places = pd.read_csv('places.csv')
places.head(2)
# Choropleth Plot - high temperature
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
# drop a marker on the map for every landmark in places.csv
for idx in range(len(places)):
    folium.Marker([places['Latitude'][idx], places['Longitude'][idx]], popup=places['Place'][idx]).add_to(m)
# colour each zone by its (log) trip count
folium.Choropleth(
    geo_data=geoJSON_high,
    data=location_dict_high,
    name='choropleth',
    columns=['DOLocationID', 'avg_temp'],
    key_on='feature.properties.LocationID',
    fill_color='BuPu',
    fill_opacity=1,
    line_opacity=1,
    legend_name='Count (%)'
).add_to(m)
m.save('choropleth_plots/ChoroplethMap_high_temperature.html')
m
# same construction as above, now for the low-temperature trips
gdf_low = gpd.GeoDataFrame(pd.merge(all_month_loc_low_temp, sf, left_on='DOLocationID', right_on='LocationID')).drop('DOLocationID', axis=1)
# one geometry per zone, serialised to geoJSON for folium
geoJSON_low = gdf_low[['LocationID', 'geometry']].drop_duplicates('LocationID').to_json()
# trips per drop-off zone, log-scaled for the colour range
location_dict_low = all_month_loc_low_temp.groupby(['DOLocationID']).count().reset_index()[['DOLocationID','avg_temp' ]]
location_dict_low['avg_temp'] = np.log(location_dict_low['avg_temp'])
# Choropleth Plot - low temperature
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
# drop a marker for every landmark in places.csv
for idx in range(len(places)):
    folium.Marker([places['Latitude'][idx], places['Longitude'][idx]], popup=places['Place'][idx]).add_to(m)
# colour each zone by its (log) trip count
folium.Choropleth(
    geo_data=geoJSON_low,
    data=location_dict_low,
    name='choropleth',
    columns=['DOLocationID', 'avg_temp'],
    key_on='feature.properties.LocationID',
    fill_color='BuPu',
    fill_opacity=1,
    line_opacity=1,
    legend_name='Count (%)'
).add_to(m)
m.save('choropleth_plots/ChoroplethMap_low_temperature.html')
m
# build one frame of (DOLocationID, tot_prec) covering every 2019 month
monthly_frames = []
for mm in months:
    # read the month and keep only timestamp + drop-off location
    trips = pd.read_feather(f'yellow_2019/yellow_tripdata_2019-{mm}.feather').drop('index', axis=1)[['tpep_pickup_datetime', 'DOLocationID']]
    # attach the hourly total precipitation, then drop the join keys
    merged = pd.merge(trips, weather_19[['dt_iso', 'tot_prec']], left_on='tpep_pickup_datetime', right_on='dt_iso').drop(['dt_iso', 'tpep_pickup_datetime'], axis=1)
    monthly_frames.append(merged)
all_month_loc_prec = pd.concat(monthly_frames)
all_month_loc_prec.tail(2)
# split the trips by whether any precipitation fell during the pickup hour
all_month_loc_wth_prec = all_month_loc_prec[all_month_loc_prec['tot_prec'] > 0]
all_month_loc_wo_prec = all_month_loc_prec[all_month_loc_prec['tot_prec'] == 0]
# GeoDataFrame joining the no-precipitation trips with the zone polygons
gdf_wo_prec = gpd.GeoDataFrame(pd.merge(all_month_loc_wo_prec, sf, left_on='DOLocationID', right_on='LocationID')).drop('DOLocationID', axis=1)
# one geometry per zone, serialised to geoJSON for folium
geoJSON_wo_prec = gdf_wo_prec[['LocationID', 'geometry']].drop_duplicates('LocationID').to_json()
# trips per drop-off zone, log-scaled for the colour range
location_dict_wo_prec = all_month_loc_wo_prec.groupby(['DOLocationID']).count().reset_index()
location_dict_wo_prec['tot_prec'] = np.log(location_dict_wo_prec['tot_prec'])
# Choropleth Plot - without precipitation
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
# drop a marker for every landmark in places.csv
for idx in range(len(places)):
    folium.Marker([places['Latitude'][idx], places['Longitude'][idx]], popup=places['Place'][idx]).add_to(m)
# colour each zone by its (log) trip count
folium.Choropleth(
    geo_data=geoJSON_wo_prec,
    data=location_dict_wo_prec,
    name='choropleth',
    columns=['DOLocationID', 'tot_prec'],
    key_on='feature.properties.LocationID',
    fill_color='BuPu',
    fill_opacity=1,
    line_opacity=1,
    legend_name='Count (%)'
).add_to(m)
m.save('choropleth_plots/ChoroplethMap_without_precipitation.html')
m
# same construction as above, now for the trips taken during precipitation
gdf_wth_prec = gpd.GeoDataFrame(pd.merge(all_month_loc_wth_prec, sf, left_on='DOLocationID', right_on='LocationID')).drop('DOLocationID', axis=1)
# one geometry per zone, serialised to geoJSON for folium
geoJSON_wth_prec = gdf_wth_prec[['LocationID', 'geometry']].drop_duplicates('LocationID').to_json()
# trips per drop-off zone, log-scaled for the colour range
location_dict_wth_prec = all_month_loc_wth_prec.groupby(['DOLocationID']).count().reset_index()
location_dict_wth_prec['tot_prec'] = np.log(location_dict_wth_prec['tot_prec'])
# Choropleth Plot - with precipitation
m = folium.Map(location=[40.66, -73.94], tiles="Stamen Terrain", zoom_start=10)
# drop a marker for every landmark in places.csv
for idx in range(len(places)):
    folium.Marker([places['Latitude'][idx], places['Longitude'][idx]], popup=places['Place'][idx]).add_to(m)
# colour each zone by its (log) trip count
folium.Choropleth(
    geo_data=geoJSON_wth_prec,
    data=location_dict_wth_prec,
    name='choropleth',
    columns=['DOLocationID', 'tot_prec'],
    key_on='feature.properties.LocationID',
    fill_color='BuPu',
    fill_opacity=1,
    line_opacity=1,
    legend_name='Count (%)'
).add_to(m)
m.save('choropleth_plots/ChoroplethMap_with_precipitation.html')
m